# 07_postcluster_map_regressions.R
# Maps cluster membership for example PHAs and estimates ordered
# logit of cluster membership
# Produces:
# - Figure A1: Map of PHA characteristics + cluster memberships for select PHAs
# - Figure A2. Proportion of Housing Authorities with Preferences by Size
# - Table S10: Results for multinomial regression predicting cluster membership

library(tidyverse)
library(here)
library(ggthemes)
library(ggrepel)
library(ggsci)
library(nnet)
library(stargazer)
options(scipen=999)

# Load data on PHAs in sample
pha_pref_ref <- readRDS(here("data", "pha_pref_ref.RDS"))
# Load data on PHA characteristics
pha_all <-  readRDS(here("data", "pha_all.RDS"))
# Load cluster assignments
pref_pr <-  readRDS(here("data", "intermediate", 
                 "cluster_assignments.RDS"))

# Load PHA lat/long for later map
phas_gdp <- sf::st_read(dsn = 
                          here("data", "PHAs.gdb"))

######################################################
# Load preference summaries and PHA characteristics  #
######################################################

# Subset to only PHAs in sample and join
# on cluster assignment and PHA characteristics
pha_character <- pha_pref_ref %>%
  filter(in_sample_abt == TRUE) %>%
  arrange(pha_code) %>%
  # Join on cluster assignments 
  left_join(pref_pr, by = "pha_code") %>%
  left_join(pha_all, by = "pha_code") %>%
  mutate(cluster = as.factor(cluster_num)) %>%
  rename(acs_ssisnap = 
           derived_acs_living_in_household_with_supplemental_security_income_ssi_cash_public_assistance_income_or_food_stampssnap_in_the_past_12_months_percent) %>%
  # Log skewed variables
  mutate(log_hud_med_month_vouchers = log(hud_med_month_vouchers),
         log_hud_count_all_units = log(hud_count_all_units),
         log_median_hh_income = log(derived_median_hh_income),
         local_coc = ifelse(coc_category_noimpute == "Local COC", TRUE, FALSE))


# Generate Figure A2. Proportion of Housing Authorities with Preferences 
# by Housing Choice Voucher Program Size
pha_character %>%
  mutate(voucher_cuts = cut_number(hud_med_month_vouchers, n = 10)) %>%
  group_by(voucher_cuts) %>%
  summarize(prop_preferences = mean(any_preferences)) %>%
  mutate(quartile = 1:10,
         quartile_lab = paste0("Q", 1:10)) %>%
  ggplot(aes(x = reorder(quartile, quartile_lab), y = prop_preferences)) +
  geom_bar(stat = "identity") + 
  theme_minimal() + 
  labs(x = "Decile of median # of Housing Choice Vouchers administered monthly", 
       y = "Proportion with preferences")

ggsave(here("figs", "prop_preferences_by_decile.png"),
       plot = last_plot(),
       device = "png",
       dpi = 200)

######################################
# Map w/ observed cluster membership #
######################################

# Get around 12 examples 
set.seed(919)
other_ex <- pha_character %>%
  filter(!cluster %in% c(6)) %>%
  group_by(cluster) %>%
  sample_n(2) 

always_ex <- c("New York City Housing Authority",
               "Housing Authority of the City of Los Angeles",
               "Chicago Housing Authority",
               "Housing Authority of Portland")  

rel_vars <- c("hud_med_month_vouchers",
              "local_coc",
              "repub_2016",
              "derived_median_hh_income",
              "derived_acs_not_hispanic_or_latinowhite_alone_percent")

clusters_toplot <- pha_character %>%
  filter(pha_name %in% c(always_ex, other_ex$pha_name)) %>%
  mutate(pha_name_clean = 
           case_when(grepl("Los Angeles", pha_name) ~ "Los Angeles",
                     grepl("Fresno", pha_name) ~ "Fresno",
                     grepl("Hernando", pha_name) ~ "Hernando County, FL",
                     grepl("Chicago", pha_name) ~ "Chicago",
                     grepl("Detroit", pha_name) ~ "Detroit",
                     grepl("New York City", pha_name) ~ "NYC",
                     grepl("Portland", pha_name) ~ "Portland, OR",
                     grepl("Quanah", pha_name) ~ "Quanah, TX",
                     grepl("Wichita", pha_name) ~ "Wichita Falls, TX",
                     grepl("Richmond", pha_name) ~ "Richmond, VA",
                     grepl("Albemarle", pha_name) ~ "Albemarle, VA",
                     grepl("Barron", pha_name) ~ "Barron County, WI",
                     grepl("Duluth", pha_name, ignore.case = TRUE) ~ "Duluth, MN",
                     grepl("Crossville", pha_name) ~ "Crossville, TN",
                     TRUE ~ "Other"), 
         label = sprintf("%s (Cluster %s)\n%s vouchers, %s local CoC\n%s%% Republican vote;\n%s%% White\n$%s median income",
                         pha_name_clean,
                         cluster,
                         scales::comma(round(hud_med_month_vouchers)),
                         ifelse(local_coc, "Yes", "No"),
                         round(repub_2016 * 100, 0),
                         round(derived_acs_not_hispanic_or_latinowhite_alone_percent * 100, 0),
                         scales::comma(round(derived_median_hh_income)))) 

# Left join onto pha/lat long
cluster_winfo_wlat <- clusters_toplot %>%
  left_join(phas_gdp %>% select(PARTICIPANT_CODE, LAT, LON),
            by = c("pha_code" = "PARTICIPANT_CODE"))

# Load states
states <- map_data("state")
colors_map <- colorblind_pal()(length(unique(cluster_winfo_wlat$cluster)))

# Generate Figure A1s
ggplot() +
  geom_polygon(data = states, aes(x = long, y = lat, 
                                        group = group),
               color = "black", fill = "grey92") +
  expand_limits(x = states$long, y = states$lat) +
  theme_void() +
  geom_point(data = cluster_winfo_wlat, 
             aes(x = LON, y = LAT)) +
  geom_label_repel(data = cluster_winfo_wlat,
                   aes(x = LON, y = LAT, 
                       label = label,
                       color = cluster),
                   size = 4) +
  scale_color_npg() +
  guides(color = "none")

ggsave(here("figs", "map_pred_cluster.png"), 
       plot = last_plot(),
       device = "png", width = 12, height = 8,
       dpi = 300)

##############################
# Predict cluster membership # 
# using multinomial logistic #
##############################

# Specify variables to include
all_vars <- c("log_hud_med_month_vouchers",
              "log_hud_count_all_units",
              "derived_acs_not_hispanic_or_latinowhite_alone_percent", 
              "derived_acs_not_hispanic_or_latinoblack_or_african_american_alone_percent", 
              "derived_acs_not_hispanic_or_latinoasian_alone_percent",
              "derived_acs_hisp_any_perc",
              "derived_acs_veteran_percent", "acs_ssisnap",
              "derived_acs_below_100_percent_of_the_poverty_level_percent", 
              "derived_acs_in_the_labor_forceunemployed_percent",
              "derived_acs_renter_occupied_percent", "log_median_hh_income", 
              "derived_median_rental_burden", "derived_acs_vacant_percent",
              "urban_affordable_noassist_per100", 
              "urban_affordable_per100", 
              "human_services_densper_1000_inpov", 
              "sharkey_commorg_densper_1000_inpov", 
              "perc_tracts_urban", "repub_2016", "census_region", "local_coc"
)

# Standardize continuous vars for coef comparability
non_num <- c("local_coc", "census_region")
pha_scaled <- pha_character %>%
  mutate_at(setdiff(all_vars, non_num), ~scale(.) %>% as.vector)

# Estimate model
fit_scaled <- multinom(as.formula(paste0("cluster ~", 
                                         paste(all_vars, collapse = "+"))), 
                       data = pha_scaled)

# Generate Table S10 
stargazer(fit_scaled, report = "vcsp*", star.cutoffs = c(0.05, 0.01, 0.001))
